Text analysis: title and abstract of male and female speakers
Abstracts
# Load the presentation records, parse the date column and derive the year.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
data <- read.table("data/presentations_PPGE_2008-2019.csv", sep = ",",
                   header = TRUE, as.is = TRUE)
data$date <- dmy(data$date)
data$year <- year(data$date)
# skimr::skim(data)

# Excluding special events as round tables and discussions not related to a
# project or study presented by someone.
IDs <- c(154, 250, 211, 289, 230, 167, 212)
data <- data %>% filter(!id %in% IDs)

# Using abstracts in English (original or translated)
data <- data %>% filter(!is.na(abstract_english))

# Number of abstracts per group
table(data$gender)##
## F M
## 99 136
table(data$position_cat,data$gender)##
## F M
## others 4 1
## postdoc 21 21
## professor 21 58
## student 52 56
Tidytext
# Tokenise "title + abstract" into one row per word, keeping the speaker
# metadata columns alongside each token.
text_tok <- data %>%
  dplyr::select(id, gender, position_cat, audience_n,
                abstract_english, title_english) %>%
  mutate(text = paste(title_english, abstract_english)) %>%
  unnest_tokens(output = word, input = text)

stop_w <- tibble(word = stopwords("en"))

# remove stopwords
text <- text_tok %>%
  anti_join(stop_w, by = "word") %>%
  arrange(word)

# remove other non-words (numbers, characters) and leftover tokens
# NOTE(review): the first 285 rows are dropped by position after sorting by
# word; presumably these are the numeric/symbol tokens of this particular data
# set -- re-check the cut-off whenever the input file changes.
text <- text %>%
  slice(-c(1:285)) %>% # number and some symbols
  filter(nchar(word) != 1) %>% # letters alone
  # The last two entries used to be written `"agb"< "al"`, a comparison that
  # evaluates to TRUE, so neither "agb" nor "al" was ever filtered out.
  filter(!word %in% c("mpas", "ÎŽ13c", "ÎČ", "can", "aff", "agb", "al"))
# solving some simple plurals
# Words whose plural form is collapsed to the singular. Only the words listed
# here are modified; everything else in text$word is left untouched.
plural <- c("actions","advances", "adaptations", "amphibians", "animals",
            "ants","anurans","abundances","adjustments","adults","affects",
            "applications","approaches", "bees","builds", "birds","palms",
            "cerrados","challenges", "outputs", "queens", "techniques",
            "continents","crops", "consequences", "questions",
            "decisions","declines","determines","determinants", "defenses",
            "dynamics","agroecosystems","benefits","biomes",
            "economics", "ecosystems","environments", "experiences",
            "forests","grasslands","cases","cells","changes","chances",
            "genetics","gifts","gradients","guides","impacts",
            "increases","interactions","lives",
            "landscapes","males","mammals", "mangroves","models","movements",
            "mutualisms","networks","neotropics",
            "opilions","phenotypes","plants","projects","paths",
            "perspectives","allows","areas", "assemblages","assessments",
            "populations","promotes","relationships", "relations",
            "resources","responses","roads","services","skulls","snakes","seeds",
            "spaces", "spiders","stages", "trees", "variations",
            "threats", "characteristics", "climates","collaborations", "contexts")

# Plurals formed with "es": dropping only the final "s" would leave a non-word
# ("approaches" -> "approache"), so two characters are removed for these.
plural_es <- c("approaches")

# Compute the match once and reuse it on both sides of the assignment.
is_plural <- text$word %in% plural
n_drop <- ifelse(text$word[is_plural] %in% plural_es, 2L, 1L)
text$word[is_plural] <- substr(text$word[is_plural], 1,
                               nchar(text$word[is_plural]) - n_drop)

# - Grouping similar words:
lemma <- rbind(c("adaptive", "adaptation"),
c("abilities","ability"),
c("advancement", "advance"),
c("abundant","abundance"),
c("academies","academic"),
c("academic","academic"),
c("absent","absence"),
c("activities","activity"),
c("accomplished","accomplish"),
c("accounting","account"),
c("agricultural", "agriculture"),
c("agro", "agriculture" ),
c("amazonia","amazon" ),
c("amazonian","amazon" ),
c("allowed","allow"),
c("allowing","allow"),
c("andean","andes"),
c("apply","application"),
c("analysed","analysis"),
c("analyzed","analysis"),
c("analyzing","analysis"),
c("analyses","analysis"),
c("analytic","analysis"),
c("analytical","analysis"),
c("applying","application"),
c("apidae","apis"),
c("arachnida","arachnid"),
c("argue","argument"),
c("basal", "basis"),
c("behavioral","behavior"),
c("behavioural","behavior"),
c("bignonieae", "bignoniaceae"),
c("biological", "biology"),
c("brazilian","brazil"),
c("brazil's","brazil"),
c("brazilâs","brazil"),
c("building","build"),
c("changing", "change"),
c("cnidarian", "cnidaria"),
c("caused", "cause"),
c("causes","cause"),
c("causing", "cause"),
c("coastal","coast"),
c("changed", "change"),
c("colour", "color"),
c("colors", "color"),
c("communities","community" ),
c("competitive", "competition"),
c("complexity", "complex"),
c("convergences", "convergence"),
c("convergent", "convergence"),
c("cordatus","cordata" ),
c("croplands","crop"),
c( "cultural", "culture"),
c("darwin's", "darwin"),
c("darwinian", "darwin"),
c("defensive", "defense"),
c("dependent","dependence"),
c("detecting","detection"),
c("determine", "determinant"),
c("developmental", "development"),
c("dispersers","dispersal"),
c("disturbed", "disturbance"),
c("diversification", "diversity"),
c("dragonflies", "dragonfly"),
c("drier", "drought"),
c("ecological", "ecology"),
c("ecologists", "ecology"),
c("endemic", "endemism"),
c("effectiveness", "efficiency"),
c("environmental", "environment"),
c("evolutionary", "evolution"),
c("expanding", "expansion"),
c("extinct", "extinction"),
c("facilitate", "facilitation"),
c("fisheries", "fishery"),
c("floral", "flora"),
c("floristic", "flora"),
c("forested", "forest"),
c("functional", "function"),
c("functionally", "function"),
c("functioning", "function"),
c("frequencies", "frequency"),
c("frequently", "frequency"),
c("frequent", "frequency"),
c("geographical", "geographic"),
c("heterogeneties", "heterogeneity"),
c("heterogeneous", "heterogeneity"),
c("histories", "history"),
c("integrated", "integration"),
c("intregating", "integration"),
c("integrative", "integration"),
c("invasive", "invasion"),
c("isotopic", "isotope"),
c("linking", "link"),
c("living", "live"),
c("mammalia", "mammal"),
c("managed", "manage"),
c("managers", "manage"),
c("mathematical", "mathematics"),
c("mates", "mating"),
c("mediated", "mediate"),
c("mechanistic", "mechanism"),
c("matrices", "matrix"),
c("migratory", "migration"),
c("mimicking", "mimicry"),
c("modeling", "model"),
c("mutualistic", "mutualism"),
c("natural", "nature"),
c("neotropical", "neotropic"),
c("northeastern", "northeast"),
c("occuring", "occur"),
c("onça", "onca"),
c("opiliones", "opilion"),
c("parasite", "parasitism"),
c("parent", "parenting"),
c("phylogenies", "phylogeny"),
c("phylogenetic", "phylogeny"),
c("phylogenomic", "phylogeny"),
c("pollinators", "pollination"),
c("protected", "protect"),
c("protective", "protect"),
c("rainfall", "rain"),
c("reconstructing", "reconstruction"),
c("regulatory", "regulation"),
c("regulates", "regulation"),
c("relation", "relationship"),
c("reproductive", "reproduction"),
c("restored", "restoration"),
c("robustness", "robust"),
c("scientific", "science"),
c("scientist", "science"),
c("sexy", "sexual"),
c("simulated", "simulation"),
c("societies", "society"),
c("social", "society"),
c("socio", "society"),
c("space", "spatial"),
c("spacio", "spatial"),
c("stabilize", "stability"),
c("stable", "stability"),
c("stories", "story"),
c("strategic", "strategy"),
c("strategies", "strategy"),
c("structured", "structure"),
c("structuring", "structure"),
c("studies", "study"),
c("studing", "study"),
c("sustainable", "sustainability"),
c("theories", "theory"),
c("theoretical", "theory"),
c("threatened", "threat"),
c("tropical", "tropic"),
c("vision", "visual")
)
# Apply the lemma table: a word found in column 1 is replaced by the entry in
# column 2 of the same row.
# stringsAsFactors = FALSE guarantees character columns on every R version
# (before R 4.0 the default turned them into factors).
lemma <- as.data.frame(lemma, stringsAsFactors = FALSE)

# One vectorised lookup replaces the `for (i in 1:dim(lemma)[1])` loop. The
# result equals the sequential loop as long as no replacement value is itself
# a later lookup key with a different target, which holds for the table above.
lemma_row <- match(text$word, lemma[[1]])
has_lemma <- !is.na(lemma_row)
text$word[has_lemma] <- lemma[[2]][lemma_row[has_lemma]]

# WORDS - all data
table(text$gender)##
## F M
## 10502 13215
table(text$position_cat ,text$gender)##
## F M
## others 260 137
## postdoc 2777 2481
## professor 2055 5093
## student 5319 5504
Mean number of words by title+abstract
# Number of retained words per presentation (title + abstract), by gender.
# The quasirandom points were previously not joined to the plot with `+`, so
# the layer object was printed on its own (geom_point / stat_identity /
# position_quasirandom) instead of being drawn on the figure.
text %>%
  count(id, gender) %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ylab("Number of words in title + abstract") +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)
20 most common words
text %>%
count(word, sort = TRUE) %>%
top_n(20,n)%>%
kable()| word | n |
|---|---|
| species | 384 |
| ecology | 185 |
| forest | 174 |
| model | 157 |
| study | 157 |
| environment | 139 |
| evolution | 134 |
| landscape | 127 |
| population | 122 |
| area | 113 |
| diversity | 112 |
| community | 100 |
| male | 97 |
| plant | 97 |
| nature | 96 |
| different | 95 |
| change | 92 |
| patterns | 88 |
| present | 86 |
| animal | 82 |
| interaction | 82 |
Word cloud
# Word cloud of the whole corpus.
textplot_wordcloud(x = dfm(tokens(text$word)))

# Word clouds of the words used by each gender, in different colours.
# NOTE(review): par(new = TRUE) may draw the second cloud on top of the first
# instead of in the second mfrow panel -- confirm the overlay is intended.
par(mfrow = c(1, 2))
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "F"])),
                   col = "#6D57CF")
par(new = TRUE)
textplot_wordcloud(x = dfm(tokens(text$word[text$gender == "M"])),
                   col = "#FCA532")

# Word frequencies by gender
# Relative frequency of every word within each gender, reshaped to one row per
# word with the F and M proportions and counts side by side, plus the absolute
# and relative difference between the two proportions.
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))

# Label only the 20 words with the largest absolute difference.
props$label <- NA
props$label[1:20] <- props$word[1:20]

# Scatter plot of male vs female word frequencies on log scales.
# (A stray empty argument in aes() was removed.)
ggplot(props, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  # geom_point(size=2.5, alpha=0.5)+
  geom_jitter(size = 2.5, alpha = 0.2) +
  geom_text_repel(aes(label = label), size = 3.2) +
  scale_x_log10(name = "Male most used words",
                labels = percent_format()) +
  scale_y_log10(name = "Female most used words",
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))
# geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq.jpg", height = 5, width = 7)

# Words that are close to the dashed line have similar frequencies in both
# genders. Words that are far from the line are words that are found more in
# one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.
Correlation of word frequency use between genders:
cor.test(props$proportion_F, props$proportion_M)##
## Pearson's product-moment correlation
##
## data: props$proportion_F and props$proportion_M
## t = 71.063, df = 1615, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8581171 0.8817840
## sample estimates:
## cor
## 0.8704527
Highly correlated -> both genders tend to use the main words at similar frequencies
20 words with the largest differences in frequency
# Back-to-back bar plot of the 20 labelled words: female proportions are
# negated so they extend to the left of zero, male proportions to the right.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # Columns are selected by name instead of by position (2:3) so the reshape
  # does not depend on the column order produced upstream.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  # Axis labels show absolute values on both sides of zero.
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))
ggsave("figures/abstract_wordFreq_barplot.jpeg", units = "in", width = 7,
       height = 7, dpi = 300)

# TF IDF
The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents, for example, to one novel in a collection of novels or to one website in a collection of websites.
Calculating tf-idf attempts to find the words that are important (i.e., common) in a text, but not too common. Let's do that now.
# tf-idf of every word, with each gender acting as one "document".
gender_word_n <- count(text, gender, word)
text_id <- arrange(bind_tf_idf(gender_word_n, word, gender, n), desc(tf_idf))

# 10 "exclusive" words for each group
text_id$word <- as.factor(text_id$word)
top_tf_idf <- text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf)
ggplot(top_tf_idf,
       aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

# WORDS - professors only data
textP <- text %>% filter(position_cat == "professor")
table(textP$gender)##
## F M
## 2055 5093
Mean number of words by abstract
# Number of retained words per professor presentation, by gender.
words_per_talkP <- count(textP, id, gender)
ggplot(words_per_talkP, aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)

# 20 most common words
textP %>%
count(word, sort = TRUE) %>%
top_n(20,n)%>%
kable()| word | n |
|---|---|
| species | 90 |
| ecology | 68 |
| environment | 52 |
| evolution | 52 |
| population | 50 |
| plant | 43 |
| study | 42 |
| model | 41 |
| nature | 38 |
| ecosystem | 37 |
| change | 35 |
| diversity | 35 |
| pollination | 30 |
| research | 30 |
| society | 30 |
| biology | 29 |
| interaction | 29 |
| science | 29 |
| present | 26 |
| landscape | 25 |
| may | 25 |
| results | 25 |
Words Frequency by gender
# Same word-frequency comparison as for the full data set, restricted to
# professors: per-gender proportions side by side plus their differences.
propsP <- textP %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(names_from = gender, values_from = c(proportion, n)) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M),
         rel.dif.p = pmax(proportion_F, proportion_M) /
           pmin(proportion_F, proportion_M)) %>%
  arrange(desc(abs.dif.p))

# Label only the 20 words with the largest absolute difference.
propsP$label <- NA
propsP$label[1:20] <- propsP$word[1:20]

# The plot call used to be fused onto the previous statement's line.
ggplot(propsP, aes(x = proportion_M, y = proportion_F, color = abs.dif.p)) +
  geom_abline(color = "gray40", lty = 2) +
  # geom_point(size=2.5, alpha=0.3) +
  geom_jitter(size = 2.5, alpha = 0.3) +
  geom_text_repel(aes(label = label), size = 3) +
  scale_x_log10(name = "Male most used words", limits = c(0.0003, 0.02),
                labels = percent_format()) +
  scale_y_log10(name = "Female Most used words", limits = c(0.0003, 0.02),
                labels = percent_format()) +
  scale_color_gradient(name = "Abs Diff", low = "blue", high = "red",
                       labels = percent_format()) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))
# geom_smooth(method="lm")
ggsave("figures/abstract_wordFreq_Prof.jpg", height = 5, width = 7)

# Words that are close to the dashed line in these plots have similar
# frequencies in both genders. Words that are far from the line are words that
# are found more in one set of texts than another.
Legend: absolute differences in the frequency of the word by males and females.
Labels for the 20 words with largest differences in frequency.
Correlation of word frequency use between genders:
cor.test(propsP$proportion_F, propsP$proportion_M)##
## Pearson's product-moment correlation
##
## data: propsP$proportion_F and propsP$proportion_M
## t = 20.749, df = 548, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6137225 0.7076573
## sample estimates:
## cor
## 0.6632945
20 words with the largest differences in frequency
# Back-to-back bar plot of the 20 labelled words for professors: female
# proportions are negated so they extend to the left of zero.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(ntot = n_F + n_M) %>%
  mutate(word = fct_reorder(word, (ntot), max),
         proportion_F = proportion_F * -1) %>%
  # Columns are selected by name instead of by position (2:3) so the reshape
  # does not depend on the column order produced upstream.
  pivot_longer(c(proportion_F, proportion_M),
               names_to = "gender", values_to = "proportion")

ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(name = "gender", values = c("#6D57CF", "#FCA532"),
                    labels = c("F", "M")) +
  geom_vline(xintercept = c(-0.02, -0.01, 0, 0.01, 0.02),
             linetype = "dotted",
             col = "darkgray") +
  # Axis labels show absolute values on both sides of zero.
  scale_x_continuous(breaks = c(-0.02, -0.01, 0, 0.01, 0.02),
                     labels = c(0.02, 0.01, 0, 0.01, 0.02))
ggsave("figures/abstract_wordFreq_barplot_Prof.jpeg", units = "in", width = 7,
       height = 7, dpi = 300)

# TF IDF
# tf-idf of every word used by professors, with each gender as one "document".
gender_word_nP <- count(textP, gender, word)
text_idP <- arrange(bind_tf_idf(gender_word_nP, word, gender, n), desc(tf_idf))

# 10 "exclusive" words for each group
text_idP$word <- as.factor(text_idP$word)
top_tf_idfP <- text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf)
ggplot(top_tf_idfP,
       aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()

# Topic model - all data
LDA - latent Dirichlet allocation method for fitting topic models
It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to "overlap" each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural language.
Every document is a mixture of topics
Every topic is a mixture of words
# Document-term matrix with one document per presentation. The gender is
# appended to the id ("<id>_<gender>") so it can be read back from the
# document name after the topic model is fitted.
matext <- text %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  # Namespaced like the select() call in the tokenisation step, so the result
  # does not depend on which attached package masks select().
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# Choosing number of topics: comparing AIC
# The same seed is used for every fit so the comparison is reproducible.
ap_lda2 <- LDA(matext, k = 2, control = list(seed = 1234))
ap_lda3 <- LDA(matext, k = 3, control = list(seed = 1234))
ap_lda4 <- LDA(matext, k = 4, control = list(seed = 1234))
ap_lda5 <- LDA(matext, k = 5, control = list(seed = 1234))
ap_lda10 <- LDA(matext, k = 10, control = list(seed = 1234))
ap_lda20 <- LDA(matext, k = 20, control = list(seed = 1234))
bbmle::AICtab(ap_lda2, ap_lda3, ap_lda4, ap_lda5, ap_lda10, ap_lda20,
              base = TRUE)
## AIC dAIC df
## ap_lda2 364708.5 0.0 9539
## ap_lda3 367793.6 3085.1 14308
## ap_lda4 370805.7 6097.2 19077
## ap_lda5 375098.1 10389.6 23846
## ap_lda10 406169.0 41460.5 47691
## ap_lda20 481100.3 116391.8 95381
two-topics model seems the most plausible model
Word-topic probabilities
10 words with the largest probabilities for each group
# Word-topic probabilities (beta) of the two-topic model.
ap_topics <- tidy(ap_lda2, matrix = "beta")

# Ten highest-beta terms of each topic, ordered by topic and decreasing beta.
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bars, one panel per topic.
terms_by_beta <- mutate(ap_top_terms, term = reorder(term, beta))
ggplot(terms_by_beta, aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# words with the greatest difference in Beta between topics
beta_spread <- ap_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread## # A tibble: 256 Ă 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 abundance 0.00179 0.000769 -1.22
## 2 action 0.00180 0.000135 -3.73
## 3 activity 0.000755 0.00148 0.970
## 4 adaptation 0.000327 0.00258 2.98
## 5 addition 0.00121 0.000884 -0.447
## 6 affect 0.00107 0.00121 0.174
## 7 agriculture 0.00154 0.000775 -0.992
## 8 allow 0.00108 0.00148 0.458
## 9 along 0.0000916 0.00106 3.53
## 10 also 0.00315 0.00255 -0.310
## # ⊠with 246 more rows
# Plot the five most negative and five most positive log2 beta ratios.
# The rows used to be picked with the fixed positions c(1:5, 260:264); the
# table printed above has 256 rows, so positions 260:264 do not exist, are
# silently ignored by slice(), and the topic-2 words never appeared. The
# positions are now derived from the actual number of rows.
# The axis label typo "ration" is corrected to "ratio".
beta_spread %>%
  arrange(log_ratio) %>%
  slice(unique(c(head(seq_len(n()), 5), tail(seq_len(n()), 5)))) %>%
  ggplot(aes(fct_reorder(term, log_ratio, min), log_ratio)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log2 ratio of beta in topic 2/topic 1") +
  xlab("Word")

# Document-topic probabilities - classifying the abstracts
and comparing the two groups by gender (if there is a difference in frequency)
# Document-topic probabilities (gamma) of the two-topic model.
ap_documents <- tidy(ap_lda2, matrix = "gamma")

# Gender is the last character of the document name; each document is then
# assigned to the topic with the largest gamma.
with_gender <- mutate(ap_documents,
                      gender = substring(document, nchar(document)))
classifi <- with_gender %>%
  group_by(document, gender) %>%
  top_n(1, gamma)
table(classifi$gender, classifi$topic)##
## 1 2
## F 54 45
## M 65 71
classifi %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>% kable()| gender | 1 | 2 |
|---|---|---|
| F | 55% (54) | 45% (45) |
| M | 48% (65) | 52% (71) |
classifi %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
facet_wrap(~ gender)Chi-square test
chisq.test(classifi$gender, classifi$topic)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: classifi$gender and classifi$topic
## X-squared = 0.79212, df = 1, p-value = 0.3735
Topic model - Professors only
# Document-term matrix for professors only, one document per presentation,
# named "<id>_<gender>".
matextP <- textP %>%
  count(id, gender, word) %>%
  mutate(id = paste(id, gender, sep = "_")) %>%
  # Namespaced like the select() call in the tokenisation step, so the result
  # does not depend on which attached package masks select().
  dplyr::select(-gender) %>%
  cast_dtm(term = word, document = id, value = n)

# Candidate models with 2 to 4 topics, compared by AIC. The first LDA() call
# used to be fused onto the cast_dtm() line.
ap_lda2P <- LDA(matextP, k = 2, control = list(seed = 1234))
ap_lda3P <- LDA(matextP, k = 3, control = list(seed = 1234))
ap_lda4P <- LDA(matextP, k = 4, control = list(seed = 1234))
bbmle::AICtab(ap_lda2P, ap_lda3P, ap_lda4P, base = TRUE)
## AIC dAIC df
## ap_lda2P 107963.8 0.0 4879
## ap_lda3P 109691.5 1727.7 7318
## ap_lda4P 112088.4 4124.6 9757
word-topic probabilities
# Word-topic probabilities (beta) of the two-topic professors model.
ap_topicsP <- tidy(ap_lda2P, matrix = "beta")

# Ten highest-beta terms of each topic, ordered by topic and decreasing beta.
ap_top_termsP <- ap_topicsP %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Horizontal bars, one panel per topic.
terms_by_betaP <- mutate(ap_top_termsP, term = reorder(term, beta))
ggplot(terms_by_betaP, aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

# words with the greatest difference in Beta between topics
beta_spread <- ap_topicsP %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread## # A tibble: 371 Ă 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 ability 0.000941 1.02e- 3 0.114
## 2 abundance 0.000844 2.24e- 3 1.41
## 3 academic 0.00139 1.14e-43 -133.
## 4 accepted 0.00139 1.65e-44 -136.
## 5 account 0.00111 5.62e- 4 -0.985
## 6 across 0.000278 1.97e- 3 2.82
## 7 action 0.00209 4.20e- 4 -2.32
## 8 activity 0.00275 3.16e- 4 -3.12
## 9 adaptation 0.00111 4.22e- 3 1.92
## 10 advance 0.000278 1.69e- 3 2.60
## # ⊠with 361 more rows
# Plot the five most negative and five most positive log2 beta ratios.
# The rows used to be picked with the fixed positions c(1:5, 260:264); the
# table printed above has 371 rows, so positions 260:264 are mid-table words
# rather than the largest ratios. The positions are now derived from the
# actual number of rows.
# The axis label typo "ration" is corrected to "ratio".
beta_spread %>%
  arrange(log_ratio) %>%
  slice(unique(c(head(seq_len(n()), 5), tail(seq_len(n()), 5)))) %>%
  ggplot(aes(fct_reorder(term, log_ratio, min), log_ratio)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log2 ratio of beta in topic 2/topic 1") +
  xlab("Word")

# Document-topic probabilities
# Document-topic probabilities (gamma) of the two-topic professors model.
ap_documentsP <- tidy(ap_lda2P, matrix = "gamma")

# Gender is the last character of the document name; each document is then
# assigned to the topic with the largest gamma.
with_genderP <- mutate(ap_documentsP,
                       gender = substring(document, nchar(document)))
classifiP <- with_genderP %>%
  group_by(document, gender) %>%
  top_n(1, gamma)
table(classifiP$gender, classifiP$topic)##
## 1 2
## F 14 7
## M 30 28
library(janitor)
classifiP %>% tabyl(gender, topic) %>% adorn_percentages() %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>% kable()| gender | 1 | 2 |
|---|---|---|
| F | 67% (14) | 33% (7) |
| M | 52% (30) | 48% (28) |
classifiP %>%
# mutate(title = reorder(title, gamma * topic)) %>%
ggplot(aes(as.character(topic), gamma)) +
geom_boxplot() +
geom_violin()+
facet_wrap(~ gender)Chi-square test
chisq.test(classifiP$gender, classifiP$topic)##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: classifiP$gender and classifiP$topic
## X-squared = 0.85524, df = 1, p-value = 0.3551
Sentiment analysis
Chapter 2, Silge & Robinson. 2018
- The NRC lexicon categorizes words in a binary fashion ("yes"/"no") into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
get_sentiments("nrc")## # A tibble: 13,875 Ă 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ⊠with 13,865 more rows
- The Bing lexicon categorizes words in a binary fashion into positive and negative categories.
get_sentiments("bing")## # A tibble: 6,786 Ă 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ⊠with 6,776 more rows
- The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
get_sentiments("afinn")## # A tibble: 2,477 Ă 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ⊠with 2,467 more rows
PENSAR: tem que levar em conta número de palavras diferentes entre abstracts - principalmente se houver diferença média de número de palavras por abstract de homens e mulheres, né? ou não?
Score words difference in female and male abstracts
All data
# AFINN scores of the words of every abstract (all data). Words without an
# AFINN score are dropped by the inner join.
affword <- get_sentiments("afinn")
affc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(affword, by = "word")

# Calculating the mean of the scores for each abstract (weighted by number of
# times the word appears) by gender:
affc2 <- affc %>%
  group_by(id, gender) %>%
  summarise(sum = sum(value * n),
            mean.score = mean(value),
            weig.score = weighted.mean(value, n))

# The two plots used to be fused onto a single line.
ggplot(affc2, aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

ggplot(affc2, aes(x = gender, y = sum)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  geom_quasirandom() +
  ggtitle("SUM words score per abstract and gender")

# Professors
# AFINN scores of the words of every professor abstract. Words without an
# AFINN score are dropped by the inner join.
affword <- get_sentiments("afinn")
affcP <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(affword, by = "word")

# Calculating the mean of the scores for each abstract (weighted by number of
# times the word appears) by gender:
affc2P <- affcP %>%
  group_by(id, gender) %>%
  summarise(sum = sum(value * n),
            mean.score = mean(value),
            weig.score = weighted.mean(value, n))

# The two plots used to be fused onto a single line.
ggplot(affc2P, aes(x = gender, y = weig.score)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  geom_quasirandom() +
  ggtitle("Mean words score per abstract and gender")

ggplot(affc2P, aes(x = gender, y = sum)) +
  geom_violin() +
  geom_boxplot(width = 0.1) +
  geom_quasirandom() +
  ggtitle("SUM words score per abstract and gender")

# Frequency of sentiment words per abstract
As classificações das palavras não me parecem muito acuradas com a linguagem científica.
Precisa saber como ponderar pelo total de palavras.
All data
# Number of words of each NRC sentiment category per abstract (all data).
nrcword <- get_sentiments("nrc")
nrc <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  geom_quasirandom()

# Most used positive and negative NRC words of each gender (top 5 by count;
# top_n keeps ties). This pipeline used to be fused onto the previous plot.
text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  group_by(gender, sentiment, word) %>%
  summarise(n = sum(n)) %>%
  group_by(gender, sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()

# Professors
# Number of words of each NRC sentiment category per professor abstract.
# NOTE(review): this overwrites the all-data `nrc` object created earlier.
nrcword <- get_sentiments("nrc")
nrc <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

ggplot(nrc, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  geom_quasirandom()

# Positive words only. This and the next pipeline used to be fused onto the
# end of the preceding plot.
nrc %>%
  filter(sentiment == "positive") %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  geom_quasirandom() +
  ggtitle("Positive words")

# Most used positive and negative NRC words of each gender (top 5 by count;
# top_n keeps ties).
textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(nrcword, by = "word") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  group_by(gender, sentiment, word) %>%
  summarise(n = sum(n)) %>%
  group_by(gender, sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()

# Frequency of sentiment words per abstract
All data
# Number of Bing positive/negative words per abstract (all data).
bingword <- get_sentiments("bing")
bing_hits <- inner_join(count(text, id, gender, word, sort = TRUE),
                        bingword, "word")
bing <- bing_hits %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))
ggplot(bing, aes(x = sentiment, y = n)) +
  facet_wrap(~gender) +
  geom_violin() +
  geom_quasirandom()

# most common positive and negative words by gender
bing_top_words <- text %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, "word") %>%
  group_by(gender, sentiment, word) %>%
  summarise(n = sum(n)) %>%
  group_by(gender, sentiment) %>%
  top_n(5, n)
ggplot(bing_top_words, aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()

# Professors
# Number of Bing positive/negative words per professor abstract.
# NOTE(review): this overwrites the all-data `bing` object created earlier.
bingword <- get_sentiments("bing")
bing <- textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, by = "word") %>%
  group_by(id, gender, sentiment) %>%
  summarise(n = sum(n))

ggplot(bing, aes(x = gender, y = n)) +
  facet_wrap(~sentiment) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  geom_quasirandom()

# Most used positive and negative Bing words of each gender (top 5 by count;
# top_n keeps ties). This pipeline used to be fused onto the previous plot.
textP %>%
  count(id, gender, word, sort = TRUE) %>%
  inner_join(bingword, by = "word") %>%
  group_by(gender, sentiment, word) %>%
  summarise(n = sum(n)) %>%
  group_by(gender, sentiment) %>%
  top_n(5, n) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_grid(sentiment ~ gender, scales = "free") +
  coord_flip()